class TDQN:
    '''Implements a Trading Deep Q-Network (TDQN) trader agent.'''
def __init__(self,
episodes=10,
steps=200,
seq_len=12,
batchsize=12,
alpha=0.001,
decay_rate=0.9995,
gamma=0.99,
hidden=100,
init_cash=5,
print_rate=1,
num_a=2,
mem_siz=100000,
filename='TDQN',
run=1,
save=False,
load=False,
get_output=True,
w_updates=500,
inf=True,
min_mem=0.01,
log_freq=50,
clip=1.
):
self.episodes = episodes
self.steps = steps
self.seq_len = seq_len
self.batchsize = batchsize
self.alpha = alpha
self.decay_rate = decay_rate
self.gamma = gamma
self.hidden = hidden
self.print_rate = print_rate
self.init_cash = init_cash
self.num_a = num_a
assert self.num_a % 2 == 0, 'Please enter an even action space.'
self.filename = filename
self.run = str(run)
self.save = save
self.load = load
self.get_output = get_output
self.w_updates = w_updates
self.inf = inf
self.min_mem = min_mem
self.log_freq = log_freq
self.clip = clip
        self.get_rewards = []
        self.test = None  # flag: True while running inference on the test set
self.cash = None
self.stock_v = None
self.stock_n = None
        # fixed reference points so the online scaler has a stable range for (cash, stock value)
        self.tmp_x2a = np.linspace(-10000, 10000, 100).reshape(50, 2)
self.env_start = None
self.env_count = 1
        self.s_shape = X_train.shape[-1]  # feature dimension of the market state
        self.memory_size = mem_siz
        self.memory = PrioritizedExperienceReplay(self.memory_size, self.batchsize)
self.helpers = Helpers(self.batchsize, self.num_a, self.seq_len)
def reset_env(self):
'''Resets the environment.'''
# reset the portfolio
self.cash = self.init_cash
self.stock_v = 0.
self.stock_n = 0.
self.env_count = 0
# get x2
x2a, x2b = self.get_x2()
# test mode
if self.test:
self.env_start = 0
s2 = X_test[self.env_start].reshape(1, self.s_shape)
return [s2, x2a, x2b]
        # train mode
self.env_start = np.random.randint(0, len(X_train) - (self.steps + 1))
s2 = X_train[self.env_start].reshape(1, self.s_shape)
return [s2, x2a, x2b]
def step_1(self, shift=False):
        '''Returns the next state, terminal flag, and current price(s).'''
t = False
self.env_count += 1
ind = self.env_start + self.env_count
# test mode
if self.test:
s2 = X_test[ind].reshape(1, self.s_shape)
p_t = test['price'][ind]
p_std_t = test['close_std'][ind]
p_std_t1 = test['close'][ind + 1]
return s2, t, p_t, p_std_t, p_std_t1
        # --- data augmentation (train mode only) ---
s2 = X_train[ind]
        # add the same uniform noise value to every feature
        s2 = s2 + np.random.uniform(-.2, .2)
        # randomize the order of features
        if np.random.random() < 0.5:
            np.random.shuffle(s2)
        # occasionally replace the state with pure noise
        if np.random.random() < 0.01:
            s2 = np.random.randn(self.s_shape)
        # shift the state by the per-episode augmentation value
        if shift:
            s2 = s2 + shift
s2 = s2.reshape(1, self.s_shape)
p_t = train['price'][ind]
p_std_t = train['close_std'][ind]
        if ind == (self.env_start + self.steps - 2):
            t = True
        return s2, t, p_t, p_std_t
    def step_2(self, a, p_t):
        '''Computes the trade and updates the portfolio.'''
        C = 0.02  # proportional trading cost, matching the (1 + C) term in the position sizer
        portfolio_value = self.cash + self.stock_v
        sizer, position = self.helpers.position_sizer(a)
        if position:  # long: buy as many shares as the sized cash allows, costs included
            Q = np.floor((self.cash * sizer) / (p_t * (1 + C)))
        else:  # short leg: sell a fraction of the current holdings
            Q = -np.floor(self.stock_n * sizer)
        cash = self.cash - (Q * p_t) - (C * abs(Q) * p_t)  # cash after trade and costs
        stock_v = (self.stock_n + Q) * p_t  # new stock value
        stock_n = self.stock_n + Q  # new number of shares
        r = ((cash + stock_v) - portfolio_value) / portfolio_value  # portfolio return as reward
        self.cash = cash
        self.stock_v = stock_v
        self.stock_n = stock_n
        x2a, x2b = self.get_x2()
        return r, x2a, x2b
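    # Worked example of the trade arithmetic above (illustrative numbers only):
    # with cash = 100, stock_n = 0, p_t = 10, a long with sizer = 1 gives
    #   Q       = floor(100 / (10 * 1.02))     = 9 shares
    #   cash    = 100 - 9 * 10 - 0.02 * 9 * 10 = 8.2
    #   stock_v = 9 * 10                       = 90
    #   r       = ((8.2 + 90) - 100) / 100     = -0.018 (the cost of entry)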
    def per_targets(self, model, model_2, S, A, R, S2, T, is_w):
        '''Computes the double-DQN targets and the updated PER priorities.'''
        q_nexts = model.predict(S2, batch_size=self.batchsize).squeeze()
        # the online network selects the next action, the target network evaluates it
        a_maxs = [np.argmax(q_nexts[i]) for i in range(self.batchsize)]
        t_nexts = model_2.predict(S2, batch_size=self.batchsize).squeeze()
        deltas = []
        for i in range(self.batchsize):
            if T[i]:
                deltas.append(R[i])
            else:
                deltas.append(R[i] + (self.gamma * t_nexts[i][a_maxs[i]]))
        targets = model.predict(S, batch_size=self.batchsize).squeeze()
        deltas2 = []
        for i in range(self.batchsize):
            # measure the TD error before overwriting the target entry; aliasing
            # the targets with the raw prediction made this read back as zero
            d = deltas[i] - targets[i, A[i]]
            targets[i, A[i]] = deltas[i]
            deltas2.append(0.5 * d ** 2 if abs(d) < 1.0 else abs(d) - 0.5)
        return targets, deltas2
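    # Target rule implemented above (double DQN with prioritized replay):
    #   y_i = r_i                                                        if s'_i is terminal
    #   y_i = r_i + gamma * Q_target(s'_i, argmax_a Q_online(s'_i, a))   otherwise
    # The returned deltas2 are Huber-transformed TD errors |y_i - Q_online(s_i, a_i)|,
    # used by the caller as updated PER priorities.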
    def policy(self, model, s, epsilon):
        '''Generates an epsilon-greedy action given a state.'''
        if np.random.random() < epsilon:
            return np.random.randint(0, self.num_a)
        q = model.predict(s, batch_size=self.batchsize).squeeze()
        # collect all argmax ties and break them at random
        max_a = [i for i, j in enumerate(q) if j == np.amax(q)]
        # fall back to a random action if q is all-NaN (the comparison above never holds)
        if not max_a:
            return np.random.randint(0, self.num_a)
        return np.random.choice(max_a)
    def get_x2(self):
        '''Generates the x2 portfolio state online.'''
        # scale (cash, stock value) against the fixed reference grid
        tmp = np.concatenate((self.tmp_x2a, np.array([[self.cash, self.stock_v]])), axis=0)
        x2a = scaler.fit_transform(tmp)[-1]
        # one-hot encoding of the share count, clipped to the supported range
        x2b = np.zeros((60))
        x2b[int(np.clip(self.stock_n, 0, 59))] = 1.
        return x2a.reshape(1, 2), x2b.reshape(1, 60)
    def unpack(self, transitions):
        '''Gets a batch from memory and slices it into batched arrays.'''
        S = [transitions[i][0] for i in range(self.batchsize)]
        A = np.array([transitions[i][1] for i in range(self.batchsize)])
        R = np.array([transitions[i][2] for i in range(self.batchsize)])
        S2 = [transitions[i][3] for i in range(self.batchsize)]
        T = np.array([transitions[i][4] for i in range(self.batchsize)])
        # stack the respective inputs (x1, x2a, x2b) of S and S2 into batch arrays
        S = [np.concatenate([s[k] for s in S]) for k in range(3)]
        S2 = [np.concatenate([s2[k] for s2 in S2]) for k in range(3)]
        return S, A, R, S2, T
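    # After unpack(), the batch inputs have shapes (with B = batchsize):
    #   S and S2 = [x1 (B, s_shape), x2a (B, 2), x2b (B, 60)]
    #   A, R and T are each (B,)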
def train(self):
'''Trains selected RL agent.'''
        epsilon = 1
        if self.load:
            epsilon = 0.1
        delta = 1  # initial PER priority assigned to freshly stored transitions
Actions = {i: 0 for i in range(self.num_a)}
dists = {'cash': [], 'stock_v': [], 'stock_n': []}
        P_value = []  # running means of the portfolio value, one entry per log step
avg_loss = []
avg_c = []
avg_v = []
avg_n = []
        p_value = []  # per-step portfolio values within the current log window
# define loss and optimizer objects
loss_object = tf.keras.losses.Huber()
        optimizer = tf.keras.optimizers.Adam(learning_rate=self.alpha)  # optionally: clipnorm=self.clip
self.helpers.tensorboard_writer(logdir='logs/' + self.filename + str(self.run))
# build model
        if self.load:
            # assumption: resume from the final checkpoint index of a previous run
            # with identical settings; adjust if a different checkpoint is wanted
            episode = self.episodes - 1
            model = load_model(
                'saved/' + self.filename + '/' + str(self.run) + '/model_' + str(episode))
            model_2 = load_model(
                'saved/' + self.filename + '/' + str(self.run) + '/model2_' + str(episode))
            self.memory.sum_tree.data = np.load(
                'saved/' + self.filename + '/' + str(self.run) + '/data_' + str(episode) + '.npy',
                allow_pickle=True)
            self.get_output = False
else:
# X1 stream
x1_in = Input(shape=(self.s_shape,))
x1 = Dense(250)(x1_in)
l1 = LeakyReLU()(x1)
bn1 = BatchNormalization()(l1)
# X2a stream
x2a_in = Input(shape=(2,))
x2 = Dense(100)(x2a_in)
l2 = LeakyReLU()(x2)
bn2 = BatchNormalization()(l2)
# X2b stream
x2b_in = Input(shape=(60,))
x3 = Dense(50)(x2b_in)
l3 = LeakyReLU()(x3)
bn3 = BatchNormalization()(l3)
# Concat layers
c = Concatenate()([bn1, bn2, bn3])
# Layer 2
x4 = Dense(300)(c)
l4 = LeakyReLU()(x4)
bn4 = BatchNormalization()(l4)
# Output layer
x_out = Dense(self.num_a, activation='linear')(bn4)
layer_outputs = [x1, l1, bn1, x2, l2, bn2, x3, l3, bn3, x4, l4, bn4, x_out]
            model = Model([x1_in, x2a_in, x2b_in], x_out)
            # the target network must be an independent copy of the online network;
            # building Model() twice over the same layers shares weights and makes
            # the periodic target sync a no-op
            model_2 = tf.keras.models.clone_model(model)
            model_2.set_weights(model.get_weights())
            model.compile(optimizer=optimizer, loss=loss_object)
# EPISODES
for episode in range(self.episodes):
# reset metrics
actions = []
temp_trans = []
# data augment shift value
shift = np.random.uniform(-0.25, 0.25)
# reset environment
s = self.reset_env()
            # periodic target-network weight sync
if episode % self.w_updates == 0:
weights = model.get_weights()
model_2.set_weights(weights)
# epsilon decay
            epsilon = max(epsilon * self.decay_rate ** episode, 0.1)
# STEPS
            for step in range(2500):  # breaks at the terminal step
                a = self.policy(model, s, epsilon)
s2_x1, t, p_t, p_std_t = self.step_1(shift)
r, x2a, x2b = self.step_2(a, p_t)
s2 = [s2_x1, x2a, x2b]
# TRAINING
if self.memory.len_memory() > (self.min_mem * self.memory_size):
# get transition
transitions, idxs, is_w = self.memory.sample_transition()
# unpack transition
S, A, R, S2, T = self.unpack(transitions)
# get targets
targets, deltas = self.per_targets(model, model_2, S, A, R, S2, T, is_w)
# optimize
loss = model.train_on_batch(x=S, y=targets, sample_weight=is_w)
# collect scalars
avg_loss.append(loss)
avg_c.append(self.cash)
avg_v.append(self.stock_v)
avg_n.append(self.stock_n)
p_value.append(self.cash + self.stock_v)
actions.append(a)
                # clip the reward to its sign
                if r > 0:
                    r = 1.
                elif r < 0:
                    r = -1.
                # store the transition at the initial priority; terminal transitions
                # must be stored too, otherwise the terminal branch in per_targets
                # never triggers
                self.memory.store_transition(delta, (s, a, r, s2, t, p_t))
                if t:
                    break
                s = s2
            # ---------- tensorboard logging ----------
            if episode % max(1, self.episodes // self.log_freq) == 0:
                # gradients of the model outputs w.r.t. the weights (legacy backend API)
                weights = model.trainable_weights
                grads = K.gradients(model.output, weights)
                f = K.function(model.inputs, grads)
                eval_grads = f(s)
                mean_eval_grads = np.mean([np.mean(g) for g in eval_grads])
                # write scalars to tensorboard at each logging step
self.helpers.tensorboard_scalar('loss', np.mean(avg_loss), episode)
self.helpers.tensorboard_scalar('score', np.mean(p_value), episode)
self.helpers.tensorboard_scalar('grads', mean_eval_grads, episode)
                # write gradient histograms to tensorboard
for w, g in zip(weights, eval_grads):
self.helpers.tensorboard_hist(w.name + 'grads_', g, episode)
# write weights to tensorboard
for layer in model.layers[1:]:
for i, value in enumerate(layer.get_weights()):
tag = layer.weights[i].name
self.helpers.tensorboard_hist(tag, value, episode)
if self.get_output:
# write outputs to tensorboard
for i, layer_output in enumerate(layer_outputs):
output = Model(
[x1_in, x2a_in, x2b_in], layer_output).predict(s, batch_size=self.batchsize)
self.helpers.tensorboard_hist('layer_output/' + str(i), output, episode)
# dists
dists['cash'].append(np.mean(avg_c))
dists['stock_v'].append(np.mean(avg_v))
dists['stock_n'].append(np.mean(avg_n))
P_value.append(np.mean(p_value))
# actions
for key in Counter(actions).keys():
Actions[key] += Counter(actions)[key]
                actions, avg_loss, p_value, avg_c, avg_v, avg_n = [], [], [], [], [], []
# PLOT
            if (episode + 1) % max(1, self.episodes // self.print_rate) == 0:
print(datetime.now())
self.helpers.plot_train(episode, dists, P_value, Actions, self.filename, self.run)
# inference
if self.inf:
self.inference(model, episode)
if self.save:
dir_path = 'saved/' + self.filename + '/' + str(self.run)
file_path_model = '{}/model_' + str(episode)
file_path_model_2 = '{}/model2_' + str(episode)
                file_path_data = '{}/data_' + str(episode)  # keep in sync with the load path above
self.helpers.mkdir_p(dir_path)
model.save(file_path_model.format(dir_path))
model_2.save(file_path_model_2.format(dir_path))
np.save(file_path_data.format(dir_path), self.memory.sum_tree.data)
def inference(self, model, episode):
        '''Tests the agent over the full test period.'''
self.test = True
s = self.reset_env()
Actions = {i: 0 for i in range(self.num_a)}
actions = []
Score2 = []
Score = []
score = 0
p_value = []
Q_values = {i: [] for i in range(self.num_a)}
for step in range(len(X_test) - 2):
a = self.policy(model, s, epsilon=0)
s2_x1, _, p_t, _, p_t1 = self.step_1()
r, x2a, x2b = self.step_2(a, p_t)
s2 = [s2_x1, x2a, x2b]
p_value.append(self.cash + self.stock_v)
score += r
q_values = model.predict(s).squeeze()
            for ind, value in enumerate(q_values):
                Q_values[ind].append(value)
if a == 0:
if p_t1 > 0:
Score2.append(0)
else:
Score2.append(2)
elif a == 1:
if p_t1 < 0:
Score2.append(1)
else:
Score2.append(2)
Score.append(score)
actions.append(a)
s = s2
for key in Counter(actions).keys():
Actions[key] += Counter(actions)[key]
buy, opt, buy_p, opt_p = self.helpers.optimum(test, self.init_cash)
self.helpers.plot_inference(data, p_value, Score, Score2, Q_values,
Actions, buy, opt, buy_p, opt_p, self.filename, self.run, episode)
self.test = False
    def train_multiple(self):
        '''Trains multiple variations of agents.'''
        run = 1
        run_data = (150, 300)  # hidden sizes, one per run
        run_data_1 = ()  # episode counts, one per run; must be filled in to match run_data
        for hidden, episodes in zip(run_data, run_data_1):
            self.run = str(run)
            print('*' * 65, '\n', 'run:', run, 'time:', datetime.now())
            # run variables (note: the layer sizes are hardcoded in train(), so
            # self.hidden is stored but not currently wired into the architecture)
            self.hidden = hidden
            self.episodes = episodes
            # fresh replay memory for each run
            self.memory = PrioritizedExperienceReplay(self.memory_size, self.batchsize)
            self.train()
            K.clear_session()
            run += 1
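
# Minimal usage sketch (an illustration, not part of the original module): the class
# relies on module-level globals -- X_train, X_test, train, test, scaler, and the
# PrioritizedExperienceReplay and Helpers classes -- which must be defined first.
if __name__ == '__main__':
    agent = TDQN(episodes=100, steps=200, init_cash=5,
                 filename='TDQN', run=1, save=False, inf=True)
    agent.train()  # trains one agent, logging to logs/TDQN1 and plotting via Helpers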